# Goal: Assess relationship between bias and degree of sample selection due to due to omitted attribute
# Dependency: none

library(MatchIt)
library(weights)
library(Hmisc)

## SET SIMULATION PARAMETERS ##

## SET NUMBER OF OBSERVATIONS (N), NUMBER OF SIMULATIONS (J) ##

N_pop = 100000; J = 1000

##--- Strata defined by variable B (not B = A) ---#

PropB_pop <- 0.10 # proportion strata B in population

### Equal probability of selection:
### PropB_sample <- PropB_pop

# Unequal probability of selection 
PropB_sample <- 0.5 # proportion strata B in sample

##--- Strata defined by variable G (not G = F) ---#

PropG_pop <- 0.5 # proportion strata G in population

### Unequal probability of selection 

PropG_sample.vector <- c(0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95)

sim.results.miss <- list()

for (k in 1:length(PropG_sample.vector)) {

print(paste("Simulation #", k, " of ", length(PropG_sample.vector), "; PropG_sample = ", PropG_sample.vector[k], sep = ""))

PropG_sample <- PropG_sample.vector[k] # proportion strata G in sample

##----------------- Sample sizes -----------------#

N_sample <- N_pop / 100

Naf <- N_sample * (1 - PropB_sample) * (1 - PropG_sample)
Nag <- N_sample * (1 - PropB_sample) * PropG_sample
Nbf <- N_sample * PropB_sample * (1 - PropG_sample)
Nbg <- N_sample * PropB_sample * PropG_sample

### assume stratified sampling
### probability of selection

Pa <- (Naf + Nag) / (N_pop * (1 - PropB_pop))
Pb <- (Nbf + Nbg) / (N_pop * PropB_pop)

Pf <- (Naf + Nbf) / (N_pop * (1 - PropG_pop))
Pg <- (Nag + Nbg) / (N_pop * PropG_pop)

Paf <- Naf / (N_pop * (1 - PropB_pop) * (1 - PropG_pop))
Pag <- Nag / (N_pop * (1 - PropB_pop) * PropG_pop)
Pbf <- Nbf / (N_pop * PropB_pop * (1 - PropG_pop))
Pbg <- Nbg / (N_pop * PropB_pop * PropG_pop)

##------ SET PARAMETERS OF OUTCOME MODEL ------##

beta0 <- 1 # Treatment effect
betahb <- 0.75 # Treatment effect, dev strata B
betahg <- -0.5 # Treatment effect, dev strata G

### Average treatment effect:
avg.beta <- beta0 + betahg * PropB_pop + betahg * PropG_pop

alpha0 <- -0.25 ; alpha1 <- 0.60 ; 

alpha2 <- 0.30 # impact of Sb ;

alpha3 <- 0 # we assume that Sg only affects Y through betahg

##------ SET PARAMETERS OF PROPENSITY SCORE MODEL ------##

gamma0 <- -1 ; gamma1 <- 2  ; gamma2 <- 1

gamma3 <- 0 # we assume that Sg does not affect treatment assignment

#--------------------------------------------------#
#-------------- Generate population data ----------#
#--------------------------------------------------#

### GENERATE CONFOUNDERS ###

set.seed(1)

X <- runif(N_pop, min=-1, max=1)

Sb <- c(rep(0, N_pop * (1 - PropB_pop)), rep(1, N_pop * PropB_pop))

Sg <- sample(c(rep(0, N_pop * (1 - PropG_pop)), rep(1, N_pop * PropG_pop))) 

#### COMPUTE TRUE PROPENSITY SCORE ####

invlogit <- function(x) {
  
  invlogit <- 1/(1 + exp(-x))
  
  invlogit
}

LP <- gamma0 + gamma1 * X + gamma2 * Sb + gamma3 * Sg

PS <- invlogit(LP) 

#### GENERATE TREATMENT ASSIGNMENTS BASED ON TRUE PROPENSITY SCORE ####

set.seed(2)

T <- rbinom(N_pop, 1, PS)

#### COMPUTE OUTCOME CONDITIONAL ON TREATMENT ASSIGNMENT AND CONFOUNDER ####

set.seed(3)

Y_error <- rnorm(N_pop, 0, 1)

Y <- alpha0 + alpha1 * X + alpha2 * Sb + alpha3 * Sg + (beta0 + betahb * Sb + betahg * Sg) * T + Y_error

popdata <- data.frame(X, Sb, Sg, T, Y, PS, LP)

#### TRUE AVERAGE TREATMENT EFFECT ON THE TREATED

PropB_treated_pop <- prop.table(table(popdata$T,popdata$Sb), 1)[2,2]
PropG_treated_pop <- prop.table(table(popdata$T,popdata$Sg), 1)[2,2]

avg.beta.treated <- beta0 + betahb * PropB_treated_pop + betahg * PropG_treated_pop

#### SAVE TRUE AVERAGE TREATMENT EFFECTS

true.effect <- list("ATE" = avg.beta, "ATT" = avg.beta.treated)

#--------------------------------------------------#
#------- Generate data for simulation study -------#
#--------------------------------------------------#

data.sims <- NULL

set.seed(4)

for (j in 1:J) { # ITERATE OVER SIMULATIONS

#### STORE DATA USED IN SIMULATION STUDY ####

data.AF.pop <- subset(popdata, Sb == 0 & Sg == 0)
data.AF.samp <- data.AF.pop[sample(1:nrow(data.AF.pop), Naf,replace=FALSE),]

data.AG.pop <- subset(popdata, Sb == 0 & Sg == 1)
data.AG.samp <- data.AG.pop[sample(1:nrow(data.AG.pop), Nag,replace=FALSE),]

data.BF.pop <- subset(popdata, Sb == 1 & Sg == 0)
data.BF.samp <- data.BF.pop[sample(1:nrow(data.BF.pop), Nbf,replace=FALSE),]

data.BG.pop <- subset(popdata, Sb == 1 & Sg == 1)
data.BG.samp <- data.BG.pop[sample(1:nrow(data.BG.pop), Nbg,replace=FALSE),]

data.sims[[j]]<- rbind(data.AF.samp, data.AG.samp, data.BF.samp, data.BG.samp)

### We assume that informaiton about Sg is ignored when computing the weights
data.sims[[j]]$W <- (1 - data.sims[[j]]$Sb) * Pb/Pa + data.sims[[j]]$Sb

### Correct weights:
#data.sims[[j]]$W <- (1 - data.sims[[j]]$Sb) * (1 - data.sims[[j]]$Sg) * Pbg/Paf + 
#  (1 - data.sims[[j]]$Sb) * data.sims[[j]]$Sg * Pbg/Pag + 
#  data.sims[[j]]$Sb * (1 - data.sims[[j]]$Sg) * Pbg/Pbf + 
#  data.sims[[j]]$Sb * data.sims[[j]]$Sg

}

##------ SET PARAMETERS OF MATCHING PROCEDURE ------#  

caliper.size <- 0 # If 0, no caliper matching
n.subclass <- 8
discard.string <-"control"

vATE.naive <- NULL
vATE.wnaive <- NULL
vATT.weight <- NULL
vATT.wweight <- NULL
vATT.weight.reg <- NULL
vATT.wweight.reg <- NULL
vATT.nn <- NULL
vATT.wnn <- NULL
vATT.nn.reg <- NULL
vATT.wnn.reg <- NULL
vATT.s <- NULL
vATT.s.reg <- NULL
vATT.ws <- NULL
vATT.ws.reg <- NULL

for (j in 1:J) { # ITERATE OVER SIMULATIONS
  
#-----------------------------#
#------- Naive ATE  ----------#
#-----------------------------#
  
# Without weights:
  
ATE.naive <- wtd.t.test(data.sims[[j]]$Y[data.sims[[j]]$T == 1], data.sims[[j]]$Y[data.sims[[j]]$T == 0], weight = rep(1, sum(data.sims[[j]]$T)), weighty = rep(1, sum(1 - data.sims[[j]]$T)), alternative="two.tailed", samedata = FALSE)$additional[c(1,4)]

# With weights:

ATE.wnaive <- wtd.t.test(data.sims[[j]]$Y[data.sims[[j]]$T == 1], data.sims[[j]]$Y[data.sims[[j]]$T == 0], weight = data.sims[[j]]$W[data.sims[[j]]$T == 1], weighty = data.sims[[j]]$W[data.sims[[j]]$T == 0], alternative="two.tailed", samedata = FALSE)$additional[c(1,4)]

# Store naive estimates of treatment effects

vATE.naive <- rbind(vATE.naive, ATE.naive)
vATE.wnaive <- rbind(vATE.wnaive, ATE.wnaive)

#------------------------------------------#
#------- Propensity Score Model  ----------#
#------------------------------------------#

# Assume Sb not observed, ignore survey weights

z.out1.ps <- glm(T ~ X,data = data.sims[[j]], family=binomial(link="logit"))

PS.mle1 <-z.out1.ps$fitted.values
LP.mle1 <- log(PS.mle1/(1-PS.mle1))

# Assume Sb not observed, control for survey weights instead (function of Sb)

z.out2.ps <- glm(T ~ X + W,data = data.sims[[j]], family=binomial(link="logit"))

PS.mle2 <-z.out2.ps$fitted.values
LP.mle2 <- log(PS.mle2/(1-PS.mle2))

#--------------------------------------------#
#------- Propensity Score Weighting----------#
#--------------------------------------------#

# Propensity score weights

data.sims[[j]][data.sims[[j]]$T == 1,"PS.weight1"] <- 1
data.sims[[j]][data.sims[[j]]$T == 0,"PS.weight1"] <- PS.mle1[data.sims[[j]]$T == 0]/ (1 - PS.mle1[data.sims[[j]]$T == 0])

data.sims[[j]][data.sims[[j]]$T == 1,"PS.weight2"] <- 1
data.sims[[j]][data.sims[[j]]$T == 0,"PS.weight2"] <- PS.mle2[data.sims[[j]]$T == 0]/ (1 - PS.mle2[data.sims[[j]]$T == 0])

# Treatment effects, without survey weights

 ## Without regression adjustment

ATT.weight <- wtd.t.test(data.sims[[j]]$Y[data.sims[[j]]$T == 1], data.sims[[j]]$Y[data.sims[[j]]$T == 0], weight = data.sims[[j]]$PS.weight1[data.sims[[j]]$T == 1], weighty = data.sims[[j]]$PS.weight1[data.sims[[j]]$T == 0], alternative="two.tailed", samedata = FALSE)$additional[c(1,4)]

 ## With regression adjustment

ATT.weight.reg <- summary(lm(Y ~ T + X, data = data.sims[[j]], weights = PS.weight1))$coefficients[2,c(1,2)]

 ## Store results of ps weighting w/o survey weights

vATT.weight <- rbind(vATT.weight, ATT.weight)
vATT.weight.reg <- rbind(vATT.weight.reg, ATT.weight.reg)

# Treatment effects, with survey weights

data.sims[[j]][,"adj.PS.weight"] <- data.sims[[j]]$PS.weight2 * data.sims[[j]]$W

 ## Without regression adjustment

ATT.wweight <- wtd.t.test(data.sims[[j]]$Y[data.sims[[j]]$T == 1], data.sims[[j]]$Y[data.sims[[j]]$T == 0], weight = data.sims[[j]]$adj.PS.weight[data.sims[[j]]$T == 1], weighty = data.sims[[j]]$adj.PS.weight[data.sims[[j]]$T == 0], alternative="two.tailed", samedata = FALSE)$additional[c(1,4)]

 ## With regression adjustment

ATT.wweight.reg <- summary(lm(Y ~ T + X, data = data.sims[[j]], weights = adj.PS.weight))$coefficients[2,c(1,2)]

 ## Store results of ps weighting w/ survey weights

vATT.wweight <- rbind(vATT.wweight, ATT.wweight)
vATT.wweight.reg <- rbind(vATT.wweight.reg, ATT.wweight.reg)

#------------------------------------------#
#------- Nearest Neighbor Matching  -------#
#------------------------------------------#

# Matching

 ## Without survey weights

set.seed(1000)

m.out1<-matchit(T ~ X,distance="logit",data=data.sims[[j]],method="nearest",caliper=caliper.size,m.order="largest",discard=discard.string, ratio = 2, replace = TRUE)

matched.data1 <- match.data(m.out1)

 ## With survey weights 

set.seed(1000)

m.out2<-matchit(T ~ X + W,distance="logit",data=data.sims[[j]],method="nearest",caliper=caliper.size,m.order="largest",discard=discard.string, ratio = 2, replace = TRUE)

matched.data2 <- rbind(data.sims[[j]][rownames(m.out2$match.matrix),],
                       data.sims[[j]][m.out2$match.matrix[,1],],
                       data.sims[[j]][m.out2$match.matrix[,2],])
matched.data2$W.matched.treated <- rep(data.sims[[j]][rownames(m.out2$match.matrix),]$W,3)
matched.data2$weights <- match.data(m.out2)[rownames(matched.data2),]$weights

# Treatment effects

 ## Without survey weights

ATT.nn <- wtd.t.test(matched.data1$Y[matched.data1$T == 1], matched.data1$Y[matched.data1$T == 0], weight = matched.data1$weights[matched.data1$T == 1], weighty = matched.data1$weights[matched.data1$T == 0], alternative="two.tailed", samedata = FALSE)$additional[c(1,4)]

ATT.nn.reg <- summary(lm(Y ~ T + X, data = matched.data1, weights = weights))$coefficients[2,c(1,2)]

  ### Store results of nearest neighbor matching w/o survey weights

vATT.nn <- rbind(vATT.nn, ATT.nn)
vATT.nn.reg <- rbind(vATT.nn.reg, ATT.nn.reg)

 ## With survey weights

matched.data2$W2 <- matched.data2$W * matched.data2$weights

ATT.wnn <- wtd.t.test(matched.data2$Y[matched.data2$T == 1], matched.data2$Y[matched.data2$T == 0], weight = matched.data2$W2[matched.data2$T == 1], weighty = matched.data2$W2[matched.data2$T == 0], alternative="two.tailed", samedata = FALSE)$additional[c(1,4)]

ATT.wnn.reg <- summary(lm(Y ~ T + X, data = matched.data2, weights = W2))$coefficients[2,c(1,2)]

  ### Store results of nearest neighbor matching w/ survey weights

vATT.wnn <- rbind(vATT.wnn, ATT.wnn)
vATT.wnn.reg <- rbind(vATT.wnn.reg, ATT.wnn.reg)

#-------------------------------------------#
#------- Subclassification Matching  -------#
#-------------------------------------------#

# Matching

 ## Without survey weights

m.out1 <- matchit(T ~ X,distance="logit",data=data.sims[[j]],method="subclass",discard=discard.string, subclass = n.subclass, sub.by = "all")

matched.data1 <- match.data(m.out1)

 ## With survey weights

m.out2 <- matchit(T ~ X + W,distance="logit",data=data.sims[[j]],method="subclass",discard=discard.string, subclass = n.subclass, sub.by = "all")

matched.data2 <- match.data(m.out2)

# Subclass weights

 ## Without survey weights

subclass.weights1 <- aggregate(T ~ subclass, data = subset(matched.data1), FUN = sum)$T / sum(matched.data1$T)

 ## With survey weights

subclass.weights2 <- aggregate((T * W) ~ subclass, data = subset(matched.data2), FUN = sum)[,"(T * W)"]/sum(matched.data2$T * matched.data2$W)

# Treatment effects

 ## Without survey weights

  ### Without regression adjustment

ATT.s <- wtd.t.test(matched.data1$Y[matched.data1$T == 1], matched.data1$Y[matched.data1$T == 0], weight = matched.data1$weights[matched.data1$T == 1], weighty = matched.data1$weights[matched.data1$T == 0], alternative="two.tailed", samedata = FALSE)$additional[c(1,4)]

  ### With regression adjustment

ATT.s.reg.list <- list()

for (s in 1:n.subclass) {
  ATT.s.reg.list[[s]] <- summary(lm(Y ~ T + X, data = matched.data1[matched.data1$subclass == s,]))$coefficients[2,c(1,2)]
}

ATT.s.reg <- NULL
ATT.s.reg[1] <- weighted.mean(matrix(unlist(ATT.s.reg.list), ncol = 2, byrow = TRUE)[,1], w = subclass.weights1)
ATT.s.reg[2] <- sqrt(weighted.mean((matrix(unlist(ATT.s.reg.list), ncol = 2, byrow = TRUE)[,2])^2, w = subclass.weights1))

  ### Store results of subclassif matching w/o survey weights

vATT.s <- rbind(vATT.s, ATT.s)
vATT.s.reg <- rbind(vATT.s.reg, ATT.s.reg)

 ## With survey weights

matched.data2$W2 <- matched.data2$W * matched.data2$weights

  ### Without regression adjustment

ATT.ws <- wtd.t.test(matched.data2$Y[matched.data2$T == 1], matched.data2$Y[matched.data2$T == 0], weight = matched.data2$W2[matched.data2$T == 1], weighty = matched.data2$W2[matched.data2$T == 0], alternative="two.tailed", samedata = FALSE)$additional[c(1,4)]

  ### With regression adjustment

ATT.ws.reg.list <- list()

for (s in 1:n.subclass) {
  ATT.ws.reg.list[[s]] <- summary(lm(Y ~ T + X, data = matched.data2[matched.data2$subclass == s,], weights = W2))$coefficients[2,c(1,2)]
}

ATT.ws.reg <- NULL
ATT.ws.reg[1] <- weighted.mean(matrix(unlist(ATT.ws.reg.list), ncol = 2, byrow = TRUE)[,1], w = subclass.weights2)
ATT.ws.reg[2] <- sqrt(weighted.mean((matrix(unlist(ATT.ws.reg.list), ncol = 2, byrow = TRUE)[,2])^2, w = subclass.weights2))

  ### Store results of subclassif matching w/ survey weights

vATT.ws <- rbind(vATT.ws, ATT.ws)
vATT.ws.reg <- rbind(vATT.ws.reg, ATT.ws.reg)

}

#--------------------------------------------------#
#------- Store results of entire procedure  -------#
#--------------------------------------------------#

sim.results.miss[[k]] <- list("true effect" = true.effect, "vATE.naive" = vATE.naive, "vATE.wnaive" = vATE.wnaive, "vATT.weight" = vATT.weight, "vATT.wweight" = vATT.wweight, "vATT.weight.reg" = vATT.weight.reg, "vATT.wweight.reg" = vATT.wweight.reg, "vATT.nn" = vATT.nn, "vATT.wnn" = vATT.wnn, "vATT.nn.reg" = vATT.nn.reg, "vATT.wnn.reg" = vATT.wnn.reg,  "vATT.s" = vATT.s, "vATT.ws" = vATT.ws, "vATT.s.reg" = vATT.s.reg, "vATT.ws.reg" = vATT.ws.reg)

}

save(sim.results.miss, file = "sim_results_miss2.Rdata")
